#DO NOT RUN#

# Code used to generate the topics. This code was run with the three batches of 
# essays we used in our analysis: the merged essays, significant challenge 
# essays, and the creative side essays.

# Libraries
library('stm')
library("tidytext")
library('lda')
library("quanteda")
library("dplyr")
library("ggplot2")

# This code looks at private school essays and student-level characteristics.
# Only one input file is analysed per run. The full second-wave file used to be
# read first and then immediately overwritten by the merged-only file, so that
# read is kept here only as a commented-out alternative.
# text <- read.csv("second_wave_2017.csv", header = TRUE)
text <- read.csv("second_wave_merged_only_2017.csv", header = TRUE)

# Drop row 22997 by position.
# NOTE(review): the reason this row is excluded is not recorded here -- confirm
# it still applies before reusing this script with a different input file.
text <- text[-22997, ]

# Keep only essays longer than 50 characters that also have a reported
# household income. Both conditions must hold for a row to be retained.
text <- filter(
  text,
  nchar(as.character(Essay)) > 50,
  !is.na(FAMILY_INCOME)
)


# Convert the essays to a character vector for later use, then clean them:
#   1. add a space after every period (catches "word.Word" typos),
#   2. add a space after every comma,
#   3. replace each run of digits with a single space (removes numbers).
# The first two replacements use fixed = TRUE so "." and "," are matched
# literally rather than as regular-expression syntax.
raw_essay_text <- as.character(text$Essay) %>%
  gsub(".", ". ", ., fixed = TRUE) %>%
  gsub(",", ", ", ., fixed = TRUE) %>%
  gsub("[0-9]+", " ", .)

# Write the cleaned essays back so downstream steps use the cleaned text.
text$Essay <- raw_essay_text

# Check for encoding problems: iconv() substitutes the marker string for any
# byte it cannot convert from latin1 to ASCII, and grep() then prints the
# positions of the entries that contain the marker.
# NOTE(review): this inspects the Signif_Chal column, whereas the rest of the
# script works on Essay -- confirm this is the intended column for this batch.
grep(
  "I_WAS_NOT_ASCII",
  iconv(
    text$Signif_Chal,
    from = "latin1",
    to = "ASCII",
    sub = "I_WAS_NOT_ASCII"
  )
)

# Build the document-feature matrix (dfm) from the cleaned essays: lowercase
# all characters, remove punctuation, remove the standard English stopword
# list, and stem all words. The dfm is then converted into the format the stm
# library reads, with the columns of `text` attached as document variables.
# NOTE(review): passing a character vector together with `remove`/`stem`
# straight to dfm() is deprecated in quanteda >= 3 -- presumably this was run
# on an older release; confirm the installed version before rerunning.
text_dfm <- dfm(
  as.character(text$Essay),
  tolower = TRUE,
  stem = TRUE,
  remove_punct = TRUE,
  remove = stopwords("english"),
  verbose = FALSE
)

processed_text <- convert(
  text_dfm,
  to = "stm",
  docvars = data.frame(text)
)

# Prepare the corpus for modelling. prepDocuments() receives the documents,
# vocabulary, and metadata produced by convert(); its results are unpacked
# into separate objects that the model-fitting step consumes.
processed_out <- prepDocuments(
  documents = processed_text$documents,
  vocab = processed_text$vocab,
  meta = processed_text$meta
)
processed_docs <- processed_out[["documents"]]
processed_vocab <- processed_out[["vocab"]]
processed_meta <- processed_out[["meta"]]

# Fit the structural topic model with K = 50 topics. No prevalence or content
# covariates are supplied; per the authors' note, STM without covariates
# defaults to a correlated topic model (CTM), which is used for the analysis.
# The random seed is fixed immediately before fitting for reproducibility.
set.seed(1993)
processed_fit <- stm(
  documents = processed_docs,
  vocab = processed_vocab,
  K = 50,
  data = processed_meta,
  init.type = "LDA",
  max.em.its = 9000,
  verbose = TRUE
)

# Print the top terms for each of the K topics. We use these to label the
# topics by hand. The fitted model object is `processed_fit`; R names are
# case-sensitive, so the name must match the assignment exactly.
labelTopics(processed_fit)


# Use the tidyverse to organise the gamma scores (per-document topic
# proportions) from the fitted model. tidy() yields one row per
# document-topic pair; spread() then reshapes that into one row per document
# with each of the K topics as a column.
# The model object is `processed_fit` (lowercase "f"), matching the name it
# was assigned when the model was fitted.
# spread() is superseded by tidyr::pivot_wider(); it is kept here so the row
# and column ordering relied on by the later cbind() is unchanged.
library(tidyverse)
gamma <- tidy(processed_fit, matrix = "gamma")

gamma <- gamma %>%
  spread(key = topic, value = gamma)


# Apply topic labels to the gamma data frame. The first column identifies the
# document and the remaining columns are the topics. Default names of the form
# "Topic 1", "Topic 2", ..., "Topic K" are generated here so the script runs
# as written; replace them with the hand-written labels chosen from the
# labelTopics() output, e.g.
#   cols <- c("document", "Topic 1", "Topic 2", ..., "Topic K")
cols <- c("document", paste("Topic", seq_len(ncol(gamma) - 1L)))
colnames(gamma) <- cols

# Merge the gamma columns with the other metadata for analysis. The merge is
# positional (row i of the metadata is paired with row i of gamma), so fail
# early with a clear message if the two tables do not have the same row count.
stopifnot(nrow(processed_meta) == nrow(gamma))
essays_w_gamma <- cbind(processed_meta, gamma)

# Create new CSV with other metadata, gamma scores, and essays. 
#write.csv(essays_w_gamma, "essays_w_gammas.csv")
